#!/usr/bin/env python
# coding: utf8
# -*- coding: utf-8 -*-
# Source: https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html
'''
NLTK, a natural language toolkit for Python. A useful package for any natural language processing.
For Mac/Unix with pip: $ sudo pip install -U nltk
stop_words, a Python package containing stop words.
For Mac/Unix with pip: $ sudo pip install stop-words
gensim, a topic modeling package containing our LDA model.
For Mac/Unix with pip: $ sudo pip install gensim
'''
import os
# NOTE(review): this overwrites the OS executable search PATH with a
# site-packages directory (including a stray trailing '#'). PATH does not
# affect Python's module lookup — presumably sys.path.append was intended;
# confirm before relying on this line.
os.environ["PATH"] = "/usr/local/lib/python2.7/dist-packages/gensim#"
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import sys
import codecs
# Make the Python 2.7 library directory importable at runtime.
sys.path.append("/usr/local/lib/python2.7/")
def deumlaut(s):
    """
    Transliterate German umlauts and sharp s in *s* to ASCII
    (ae/oe/ue, Ae/Oe/Ue, ss) and return the result.

    The first group of pairs maps Latin-1 single-character escapes onto
    the script's native umlaut literals (so Latin-1-encoded input is
    normalised first); the second group then performs the ASCII
    transliteration. Replacement order is significant and preserved.
    """
    replacements = (
        ('\xdf', 'ss'),
        ('\xfc', 'ü'), ('\xdc', 'Ü'),
        ('\xf6', 'ö'), ('\xd6', 'Ö'),
        ('\xe4', 'ä'), ('\xc4', 'Ä'),
        ('ö', 'oe'), ('ä', 'ae'), ('ü', 'ue'),
        ('Ü', 'Ue'), ('Ö', 'Oe'), ('Ä', 'Ae'),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    return s
# Tokenizer that splits on runs of word characters (drops punctuation).
tokenizer = RegexpTokenizer(r'\w+')
# German stop-word list (the input corpus is German text).
de_stop = get_stop_words('german')
# Porter stemmer used to reduce tokens to their stems below.
p_stemmer = PorterStemmer()
# Input file comes from the first command-line argument; read it as
# UTF-8 and close the handle deterministically via the context manager
# (the original left the file open).
filename = sys.argv[1]
with codecs.open(filename, "r", "utf-8") as f:
    text_unicode = f.read().encode("utf-8")
text_unicode = deumlaut(text_unicode)
'''
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."
test_text = "Ein rosa Kleidchen, ein farblich passendes Strohhütchen auf dem Kopf, ein breites Lächeln: Carmen Geiss " \
"(50), wie man sie kennt. Für dieses Foto kassiert die Kölner Kult-Millionärin gerade allerdings einen " \
"üblen Shitstorm. Der Grund: Die Urlauberpose hat Carmen in einem kolumbianischen Armenviertel aufgenommen. Dazu die Facebook-Erklärung: „HEUTE GEHT ES MAL IN DIE SLUMS VON CARTAGENA“ Neben Carmens schrillem Outfit sorgt auch die Anreise der Geissens für Empörung: Die Millionärs-Familie legt im Luxus-Bötchen, das den Namen „Roberto Geissini“ trägt, im Hafen der Armen an."
'''
# Split the raw text into sentences; each sentence is treated as one
# "document" for the LDA model.
doc_set = sent_tokenize(text_unicode.decode("utf8"))

# One entry per document: the list of stemmed, stop-word-free tokens.
texts = []
for doc in doc_set:
    # Lower-case, then tokenise into word tokens. Distinct names are
    # used for the loop and comprehension variables — the original
    # reused `i` everywhere, which Python 2 list comprehensions rebind.
    tokens = tokenizer.tokenize(doc.lower())
    # Remove German stop words.
    stopped_tokens = [tok for tok in tokens if tok not in de_stop]
    # Stem the remaining tokens and collect the document.
    texts.append([p_stemmer.stem(tok) for tok in stopped_tokens])

# Build the id <-> term dictionary and the bag-of-words corpus.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model; see
# https://radimrehurek.com/gensim/models/ldamodel.html
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5,
                                           id2word=dictionary, passes=128,
                                           alpha='auto', eval_every=5)
'''
https://radimrehurek.com/gensim/models/ldamodel.html
LDA models can be trained - YEEEEEESSS
'''
# The original had a literal line break inside the string literal,
# which is a syntax error in every Python version; use an explicit
# \n escape instead.
print("\nLDAPROFILER OUTPUT: ")
print(ldamodel.print_topics(num_topics=2, num_words=2))